import json as js
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import pandas_profiling as pp
%matplotlib inline
# Load the manually downloaded WeRateDogs twitter archive into a DataFrame.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
# Quick visual check of the first rows.
twitter_archive.head()
# Programmatically download the image-predictions file (neural-network
# dog-breed predictions, one row per tweet) and load it into a DataFrame.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# timeout: don't hang forever on an unresponsive server
respon = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of silently writing an error page to disk
respon.raise_for_status()
with open('image_predictions.tsv', mode='wb') as file:
    file.write(respon.content)
# The file is tab-separated, hence sep='\t'.
image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')
image_predictions.head()
# Parse the line-delimited tweet JSON file, keeping only the tweet id,
# retweet count and favorite count of each tweet.
# encoding='utf-8': Twitter JSON is UTF-8; relying on the platform default
# encoding breaks on systems where it is not UTF-8 (e.g. Windows cp1252).
tweet_json = []
with open('tweet-json.txt', encoding='utf-8') as file:
    for line in file:
        tweet = js.loads(line)
        tweet_json.append({'tweet_id': tweet['id'],
                           'retweet_count': tweet['retweet_count'],
                           'favorite_count': tweet['favorite_count']})
# Convert the list of dicts to a DataFrame with a fixed column order.
tweet_json = pd.DataFrame(tweet_json, columns=['tweet_id', 'retweet_count', 'favorite_count'])
tweet_json.head()
Visual assessment
# Visual assessment: display each gathered table in full for eyeballing.
twitter_archive
image_predictions
tweet_json
Programmatic assessment
# Programmatic assessment of twitter_archive: summary statistics,
# then dtypes / non-null counts.
twitter_archive.describe()
twitter_archive.info()
# Automated profile report (distributions, missing values, correlations).
pp.ProfileReport(twitter_archive)
# Same assessment for the image predictions table.
image_predictions.describe()
image_predictions.info()
# Automated profile report for image_predictions.
pp.ProfileReport(image_predictions)
# Same assessment for the tweet-counts table.
tweet_json.describe()
tweet_json.info()
# Automated profile report for tweet_json.
pp.ProfileReport(tweet_json)
Quality issues
- #### twitter_archive
- some columns are not needed
- timestamp has the wrong type
- some names in the name variable are not accurate
- some values in rating_denominator should be 10
- some values in rating_numerator should not be more than 10
- the source name needs to be extracted from the source variable
- tweet_id is numeric; convert it to str
- remove non-tweets (retweets), which have values in the retweet columns
- expanded_urls has some missing values, which means those tweets have no pictures
- #### image_predictions
- change column names to something descriptive
- tweet_id is numeric; convert it to str
- #### tweet_json
- tweet_id is numeric; convert it to str
Tidiness issues
- the columns (doggo, floofer, pupper and puppo) in twitter_archive need to be combined into one column
- create a master table by joining all 3 tables on tweet_id
- ### Define
- #### twitter_archive
- remove non-tweets (retweets), which have values in the retweet columns, by filtering twitter_archive to the rows without retweet data
- expanded_urls has some missing values, which means those tweets have no pictures; remove these rows
- the columns (doggo, floofer, pupper and puppo) in twitter_archive need to be combined into one column
- some values in rating_denominator should be 10
- some values in rating_numerator should not be more than 10
- extract the source name from the source variable
- some names in the name variable are not accurate; replace 'a' with None
- timestamp has the wrong type
- tweet_id is numeric; convert it to str
- drop the columns that are not needed
- #### image_predictions
- change column names to something descriptive
- tweet_id is numeric; convert it to str
- #### tweet_json
- tweet_id is numeric; convert it to str
- #### create master table
- ## Code
# --- Clean the twitter_archive table ---

# Keep only original tweets: retweets carry a non-null retweeted_status_id.
# NOTE: the previous query compared the float column to the *string* "NaN",
# which never matches actual NaN values; isnull() is the correct test.
# .copy() prevents SettingWithCopyWarning on the assignments below.
twitter_clean = twitter_archive[twitter_archive.retweeted_status_id.isnull()].copy()

# Drop rows with no expanded_urls — these tweets have no pictures.
twitter_clean = twitter_clean[twitter_clean.expanded_urls.notnull()]

# Combine the four dog-stage columns into a single 'stage' column.
# Each stage column holds either the stage name or the string 'None';
# row-wise max picks a real stage over 'None' (lowercase letters sort
# after 'N').  If a tweet has several stages the lexicographically
# largest one wins — same behavior as the original code.
twitter_clean['stage'] = twitter_clean[['doggo', 'floofer', 'pupper', 'puppo']].max(axis=1)

# Cap rating_numerator at 10 and normalize rating_denominator to 10.
twitter_clean.loc[twitter_clean.rating_numerator > 10, 'rating_numerator'] = 10
twitter_clean.loc[twitter_clean.rating_denominator != 10, 'rating_denominator'] = 10

# Extract the human-readable source name from the HTML anchor tag,
# e.g. '<a href="...">Twitter for iPhone</a>' -> 'Twitter for iPhone'.
# The regex covers every source value, not just a hard-coded list.
twitter_clean['source'] = twitter_clean['source'].str.extract(r'>([^<]+)<', expand=False)

# 'a' is an English article mis-parsed as a dog name; mark it as missing.
twitter_clean.loc[twitter_clean.name == 'a', 'name'] = 'None'

# Fix dtypes: timestamp -> datetime, tweet_id -> str (ids are labels,
# not quantities, and must match the other tables' join-key dtype).
twitter_clean['timestamp'] = pd.to_datetime(twitter_clean['timestamp'])
twitter_clean['tweet_id'] = twitter_clean['tweet_id'].astype(str)

# Drop reply/retweet bookkeeping columns and the now-redundant stage columns.
twitter_clean = twitter_clean.drop(
    ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
     'retweeted_status_user_id', 'retweeted_status_timestamp',
     'doggo', 'floofer', 'pupper', 'puppo'], axis=1)
# Cast tweet_id to string in the two remaining tables so all three
# tables share a common join-key dtype.
for _table in (tweet_json, image_predictions):
    _table['tweet_id'] = _table['tweet_id'].astype(str)

# Give the terse p1/p2/p3 prediction columns self-describing names.
_renames = {}
for _i in (1, 2, 3):
    _renames['p%d' % _i] = 'prediction%d' % _i
    _renames['p%d_conf' % _i] = 'pre%d_confidence' % _i
    _renames['p%d_dog' % _i] = 'pre%d_is_dog' % _i
image_predictions = image_predictions.rename(columns=_renames)
# Re-profile the three cleaned tables to confirm the fixes took effect.
pp.ProfileReport(twitter_clean)
pp.ProfileReport(tweet_json)
pp.ProfileReport(image_predictions)

# Join the three cleaned tables on tweet_id into one master table.
# Inner joins keep only tweets present in all three sources.
twitter_archive_master = twitter_clean.merge(tweet_json, on='tweet_id', how='inner')
twitter_archive_master = twitter_archive_master.merge(image_predictions, on='tweet_id', how='inner')

# Assess the combined master table.
pp.ProfileReport(twitter_archive_master)
- max retweets
- max favorites
- top sources and plot
# Headline engagement numbers for the master table.
twitter_archive_master.retweet_count.max()
twitter_archive_master.favorite_count.max()

# Plot tweet sources as a horizontal bar chart, most common on top
# (trailing semicolon suppresses the Axes repr in the notebook).
twitter_archive_master.source.value_counts().sort_values(ascending=True).plot(
    kind='barh', figsize=(10, 6), title='Top tweets sources');

# Store the cleaned master data set.  index=False keeps the DataFrame
# index from being written as a spurious unnamed extra column.
twitter_archive_master.to_csv('twitter_archive_master.csv', index=False)